Link to Kaggle, where we found our data set (TODO: insert the actual URL here).
# Ordinary least squares model specification: regression mode, 'lm' engine
lm_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')

# Preprocessing recipe: predict median house value from every other column,
# standardizing all numeric predictors
lm_rec <- recipe(median_house_value ~ ., data = housing) %>%
  step_normalize(all_numeric_predictors())

# Bundle the model spec and the recipe into one workflow
lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(lm_rec)
# BUG FIX: step_*() functions operate on recipes, not workflows -- piping
# them onto a workflow() object throws an error. Put the preprocessing
# steps into a recipe first, then add that recipe to the workflow.
housing_rec_1 <- recipe(median_house_value ~ ., data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%  # standardize numeric predictors
  step_dummy(all_nominal_predictors()) %>%      # indicator-code categorical predictors
  step_nzv(all_predictors())                    # drop near-zero-variance predictors
housing_lm_wf_1 <- workflow() %>%
  add_recipe(housing_rec_1) %>%
  add_model(lm_spec)
# BUG FIX: step_*() cannot be piped onto a workflow, and a workflow cannot
# hold both add_formula() and preprocessing steps. Express the model formula
# inside a recipe so the steps apply, then add the recipe to the workflow.
housing_rec_2 <- recipe(median_house_value ~ median_income, data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%  # no-op here (no nominal predictors), kept for consistency
  step_nzv(all_predictors())
housing_lm_wf_2 <- workflow() %>%
  add_recipe(housing_rec_2) %>%
  add_model(lm_spec)
# BUG FIX: same issue as housing_lm_wf_2 -- steps must live in a recipe,
# and the formula moves into the recipe so the steps can apply to it.
housing_rec_3 <- recipe(median_house_value ~ median_income + longitude,
                        data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_3 <- workflow() %>%
  add_recipe(housing_rec_3) %>%
  add_model(lm_spec)
# BUG FIX: same issue as housing_lm_wf_2 -- preprocessing steps belong in a
# recipe, not piped onto the workflow.
housing_rec_4 <- recipe(median_house_value ~ median_income + longitude + latitude,
                        data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_4 <- workflow() %>%
  add_recipe(housing_rec_4) %>%
  add_model(lm_spec)
# BUG FIX: same issue as housing_lm_wf_2 -- preprocessing steps belong in a
# recipe, not piped onto the workflow.
housing_rec_5 <- recipe(
  median_house_value ~ median_income + longitude + latitude + total_rooms,
  data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_5 <- workflow() %>%
  add_recipe(housing_rec_5) %>%
  add_model(lm_spec)
# BUG FIX: same issue as housing_lm_wf_2 -- preprocessing steps belong in a
# recipe, not piped onto the workflow.
housing_rec_6 <- recipe(
  median_house_value ~ median_income + longitude + latitude + total_rooms + population,
  data = housing) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors())
housing_lm_wf_6 <- workflow() %>%
  add_recipe(housing_rec_6) %>%
  add_model(lm_spec)
# Fit six nested OLS models directly to the full data set, adding one
# predictor at a time (raw formulas, no preprocessing recipe).
mod1 <- lm_spec %>%
  fit(median_house_value ~ ., data = housing)
mod2 <- lm_spec %>%
  fit(median_house_value ~ median_income, data = housing)
mod3 <- lm_spec %>%
  fit(median_house_value ~ median_income + longitude, data = housing)
mod4 <- lm_spec %>%
  fit(median_house_value ~ median_income + longitude + latitude,
      data = housing)
mod5 <- lm_spec %>%
  fit(median_house_value ~ median_income + longitude + latitude + total_rooms,
      data = housing)
mod6 <- lm_spec %>%
  fit(median_house_value ~ median_income + longitude + latitude + total_rooms + population,
      data = housing)
# Coefficient plot for the full model: point estimates with approximate
# 95% confidence intervals (estimate +/- 1.96 * SE), intercept row dropped.
coef_tbl <- tidy(mod1) %>%
  slice(-1) %>%  # drop the intercept
  mutate(lower = estimate - 1.96 * std.error,
         upper = estimate + 1.96 * std.error)
ggplot(coef_tbl) +
  geom_vline(xintercept = 0, linetype = 4) +
  geom_point(aes(x = estimate, y = term)) +
  geom_segment(aes(y = term, yend = term, x = lower, xend = upper),
               arrow = arrow(angle = 90, ends = 'both', length = unit(0.1, 'cm'))) +
  labs(x = 'Coefficient estimate (95% CI)', y = 'Feature') +
  theme_classic()
# Coefficient table for the full model (estimates on the raw data scale;
# ocean_proximity2..5 are indicator contrasts against the baseline level).
mod1 %>%
tidy()
## # A tibble: 13 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2273908. 88456. -25.7 1.88e-143
## 2 longitude -26813. 1020. -26.3 6.60e-150
## 3 latitude -25482. 1005. -25.4 9.39e-140
## 4 housing_median_age 1073. 43.9 24.4 4.85e-130
## 5 total_rooms -6.19 0.791 -7.83 5.32e- 15
## 6 total_bedrooms 101. 6.87 14.6 2.73e- 48
## 7 population -38.0 1.08 -35.3 9.35e-265
## 8 households 49.6 7.45 6.66 2.83e- 11
## 9 median_income 39260. 338. 116. 0
## 10 ocean_proximity2 3954. 1913. 2.07 3.88e- 2
## 11 ocean_proximity3 8232. 2176. 3.78 1.55e- 4
## 12 ocean_proximity4 156856. 30778. 5.10 3.49e- 7
## 13 ocean_proximity5 -35330. 2336. -15.1 2.23e- 51
# Helper: attach in-sample predictions and residuals
# (observed - predicted) to the data for a fitted model.
model_output <- function(mod, dat) {
  mod %>%
    predict(new_data = dat) %>%
    bind_cols(dat) %>%
    mutate(resid = median_house_value - .pred)
}
mod1_output <- model_output(mod1, housing)
mod2_output <- model_output(mod2, housing)
mod3_output <- model_output(mod3, housing)
mod4_output <- model_output(mod4, housing)
mod5_output <- model_output(mod5, housing)
mod6_output <- model_output(mod6, housing)
# Training-set MAE for each model. NOTE: these are computed on the same
# data used to fit, so they understate test error; see the CV results below.
mod1_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 49748.
mod2_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 62598.
mod3_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 62486.
mod4_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 54807.
mod5_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 54780.
mod6_output %>%
mae(truth = median_house_value, estimate = .pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mae standard 53892.
# Helper: residuals vs. a chosen predictor, with a smoothing curve and a
# red reference line at zero. xvar is passed unquoted (tidy evaluation).
resid_plot <- function(output, xvar) {
  ggplot(output, aes(y = resid, x = {{ xvar }})) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    theme_classic()
}
# Residual diagnostics for the full model (mod1)
resid_plot(mod1_output, median_house_value)
# not very useful?
resid_plot(mod1_output, housing_median_age)
# not very useful
resid_plot(mod1_output, ocean_proximity)
resid_plot(mod1_output, longitude)
resid_plot(mod1_output, latitude)
resid_plot(mod1_output, total_rooms)
resid_plot(mod1_output, median_income)
resid_plot(mod1_output, total_bedrooms)
# Residual diagnostics for the income-only model (mod2)
resid_plot(mod2_output, total_rooms)
resid_plot(mod2_output, median_house_value)
resid_plot(mod2_output, longitude)
resid_plot(mod2_output, latitude)
resid_plot(mod2_output, median_income)
resid_plot(mod2_output, population)
# Selected diagnostics for mod3 and mod6
resid_plot(mod3_output, population)
resid_plot(mod3_output, total_rooms)
resid_plot(mod6_output, longitude)
# 10-fold cross-validation of each of the six workflows, collecting RMSE,
# R-squared, and MAE on the held-out folds.
# NOTE(review): data_cv10 is created further down in this file
# (data_cv10 <- vfold_cv(housing, v = 10)); when running this script top to
# bottom, the fold definition must be moved above this point.
mod1_cv <- fit_resamples(housing_lm_wf_1,
resamples = data_cv10, # cv folds
metrics = metric_set(rmse, rsq, mae))
mod2_cv <- fit_resamples(housing_lm_wf_2,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod3_cv <- fit_resamples(housing_lm_wf_3,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod4_cv <- fit_resamples(housing_lm_wf_4,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod5_cv <- fit_resamples(housing_lm_wf_5,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
mod6_cv <- fit_resamples(housing_lm_wf_6,
resamples = data_cv10,
metrics = metric_set(rmse, rsq, mae))
# Average CV RMSE for each model, computed manually by unnesting the
# per-fold metrics. (collect_metrics(), used just below, reports the same
# means along with standard errors.)
mod1_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 68711.
mod2_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 83701.
mod3_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 83610.
mod4_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 74421.
mod5_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 74386.
mod6_cv %>% unnest(.metrics) %>%
filter(.metric == 'rmse') %>%
summarize(RMSE_CV = mean(.estimate))
## # A tibble: 1 × 1
## RMSE_CV
## <dbl>
## 1 73070.
# Full CV metric summaries (mean, n folds, standard error) for each model
mod1_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 49798. 10 435. Preprocessor1_Model1
## 2 rmse standard 68711. 10 798. Preprocessor1_Model1
## 3 rsq standard 0.646 10 0.00588 Preprocessor1_Model1
mod2_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 62609. 10 532. Preprocessor1_Model1
## 2 rmse standard 83701. 10 888. Preprocessor1_Model1
## 3 rsq standard 0.474 10 0.00782 Preprocessor1_Model1
mod3_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 62503. 10 553. Preprocessor1_Model1
## 2 rmse standard 83610. 10 898. Preprocessor1_Model1
## 3 rsq standard 0.475 10 0.00798 Preprocessor1_Model1
mod4_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 54826. 10 478. Preprocessor1_Model1
## 2 rmse standard 74421. 10 791. Preprocessor1_Model1
## 3 rsq standard 0.584 10 0.00545 Preprocessor1_Model1
mod5_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 54802. 10 474. Preprocessor1_Model1
## 2 rmse standard 74386. 10 792. Preprocessor1_Model1
## 3 rsq standard 0.585 10 0.00552 Preprocessor1_Model1
mod6_cv %>% collect_metrics()
## # A tibble: 3 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 53916. 10 442. Preprocessor1_Model1
## 2 rmse standard 73070. 10 799. Preprocessor1_Model1
## 3 rsq standard 0.599 10 0.00578 Preprocessor1_Model1
# LASSO regression specification: glmnet engine with the penalty left as a
# tuning parameter, to be selected by cross-validation below.
lasso_spec <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>% ## mixture = 1 indicates Lasso, we'll choose penalty later
set_engine(engine = 'glmnet') %>% #note we are using a different engine
set_mode('regression')
# Preprocessing recipe for LASSO: drop near-constant predictors, then
# standardize (required so the penalty treats predictors comparably) and
# dummy-code categoricals.
lasso_rec <- recipe(median_house_value ~ ., data = housing) %>%
step_nzv(all_predictors()) %>% # removes near-zero-variance (nearly constant) predictors
#step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
step_normalize(all_numeric_predictors()) %>% # important standardization step for LASSO
step_dummy(all_nominal_predictors()) # creates indicator variables for categorical variables
lasso_wf <- workflow() %>%
add_recipe(lasso_rec) %>%
add_model(lasso_spec)
# Tune LASSO #1: a regular grid of 30 candidate penalty values
penalty_grid <- grid_regular(
penalty(range = c(-5,5)), # exponents on the log10 scale: penalties from 10^-5 to 10^5
levels = 30)
# Tune LASSO #2: evaluate each candidate penalty by cross-validation
tune_res <- tune_grid( # new function for tuning parameters
lasso_wf, # workflow
resamples = data_cv10, # cv folds
metrics = metric_set(rmse, mae),
grid = penalty_grid)
# LASSO model fit on the full data (used below to plot the coefficient path).
# NOTE(review): lasso_wf still contains penalty = tune() at this point;
# recent parsnip versions refuse to fit an unfinalized workflow -- confirm
# this runs, or fit one of the finalized workflows instead.
lasso_fit <- lasso_wf %>%
fit(data = housing) # Fit to data
# plotting LASSO lambda: the glmnet coefficient path as the penalty grows
plot(lasso_fit %>%
extract_fit_parsnip() %>%
pluck('fit'), # way to get the original glmnet output
xvar = "lambda")
# Visualize LASSO Metrics from Tuning (CV error vs. penalty)
autoplot(tune_res) + theme_classic()
# Summarize LASSO CV Metrics
collect_metrics(tune_res) %>%
filter(.metric == 'mae') %>% # or filter to 'rmse'
select(penalty, mae = mean)
## # A tibble: 30 × 2
## penalty mae
## <dbl> <dbl>
## 1 0.00001 49791.
## 2 0.0000221 49791.
## 3 0.0000489 49791.
## 4 0.000108 49791.
## 5 0.000240 49791.
## 6 0.000530 49791.
## 7 0.00117 49791.
## 8 0.00259 49791.
## 9 0.00574 49791.
## 10 0.0127 49791.
## # … with 20 more rows
# choose penalty value based on lowest mae or rmse
best_penalty <- select_best(tune_res, metric = 'mae')
best_penalty
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 78.8 Preprocessor1_Model21
# choose penalty value based on the largest penalty within 1 se of the lowest CV MAE
# (a sparser, more conservative model than select_best)
best_se_penalty <- select_by_one_std_err(tune_res, metric = 'mae', desc(penalty))
# Fit Final LASSO Models with the chosen penalties substituted for tune()
final_wf <- finalize_workflow(lasso_wf, best_penalty) # incorporates penalty value to workflow
final_wf_se <- finalize_workflow(lasso_wf, best_se_penalty) # incorporates penalty value to workflow
final_fit <- fit(final_wf, data = housing)
final_fit_se <- fit(final_wf_se, data = housing)
# Coefficients at the best-MAE penalty (78.8): all 12 predictors retained
tidy(final_fit)
## # A tibble: 13 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 216740. 78.8
## 2 longitude -50809. 78.8
## 3 latitude -51528. 78.8
## 4 housing_median_age 13453. 78.8
## 5 total_rooms -11697. 78.8
## 6 total_bedrooms 40214. 78.8
## 7 population -42467. 78.8
## 8 households 18813. 78.8
## 9 median_income 74149. 78.8
## 10 ocean_proximity_X2 2872. 78.8
## 11 ocean_proximity_X3 7510. 78.8
## 12 ocean_proximity_X4 152032. 78.8
## 13 ocean_proximity_X5 -38212. 78.8
# Coefficients at the 1-SE penalty (853): total_rooms and ocean_proximity_X2
# are shrunk exactly to zero, giving a sparser model
tidy(final_fit_se)
## # A tibble: 13 × 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) 223724. 853.
## 2 longitude -28950. 853.
## 3 latitude -28678. 853.
## 4 housing_median_age 13219. 853.
## 5 total_rooms 0 853.
## 6 total_bedrooms 26949. 853.
## 7 population -36168. 853.
## 8 households 14685. 853.
## 9 median_income 71425. 853.
## 10 ocean_proximity_X2 0 853.
## 11 ocean_proximity_X3 6385. 853.
## 12 ocean_proximity_X4 109045. 853.
## 13 ocean_proximity_X5 -55699. 853.
# Residual diagnostics for the 1-SE LASSO fit: attach predictions and
# residuals, then plot residuals against fitted values.
lasso_mod_out <- predict(final_fit_se, new_data = housing) %>%
  bind_cols(housing) %>%
  mutate(resid = median_house_value - .pred)

ggplot(lasso_mod_out, aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  geom_hline(yintercept = 0) +
  theme_classic()
\(~\)
C.Compare estimated test performance across the models. Which models(s) might you prefer?
\(~\)
D. Use residual plots to evaluate whether some quantitative predictors might be better modeled with nonlinear relationships.
\(~\)
E. Which variables do you think are the most important predictors of your quantitative outcome? Justify your answer. Do the methods you’ve applied reach consensus on which variables are most important? What insights are expected? Surprising?
Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
The model that contains all of the variables in the data set is the best model so far. Based on the fact that that model yielded the lowest CV test error rate, it seems like taking into account of house age, number of rooms, location, ocean proximity, total bedrooms, population, and income level of that block of houses is important to determine house value.
The interpretability of our model is fairly straightforward, though we would like to find out whether some individual variables have more influence on house value and make predictions with those variables.
\(~\) \(~\)
Are there any harms that may come from your analyses and/or how the data were collected? What cautions do you want to keep in mind when communicating your work?
The variable median_income measures the median income for households within a block of houses. Since this variable appears to be the most persistent in our model according to our LASSO results, it is consistent with the seriously large wage gap in the United States. Income affects one’s social status, health care access, and even house value. We might further speculate — though this is an assumption our analysis does not demonstrate — that income affects house value partly through perceived neighborhood safety: where criminal activity is low, housing options attract more buyers, especially buyers of higher social status.
The data were derived from Aurélien Géron’s book ‘Hands-On Machine Learning with Scikit-Learn and TensorFlow’. They contain the 1990 California census information, specifically on housing. We don’t suspect that the way in which the data were collected is harmful. But we do need to acknowledge that those who responded to census letters were probably those who had a mailing address and those in middle- to higher-income groups. Back then, census surveys were not delivered electronically, so data collection must have missed people who did not have a mailing address or P.O. box. Additionally, people in middle- to higher-income groups were more likely to respond to the census because they could read and write English, they might have been more informed about the purpose of a census survey, and they might simply have had more time to attend to a list of questions printed on multiple pages of paper. People with lower incomes probably had to work many more hours just to make ends meet, and the census letter could have been the last thing on their minds. And keep in mind that the data set is specifically on California housing — California being arguably one of the wealthier and more liberal states in the US.
Use step_ns() for each quantitative predictor that you want to allow to be non-linear. To choose the degrees of freedom (deg_free), I recommend fitting a smoothing spline and using its edf to inform your choice.
# Linear Regression Model Spec
# Re-declare the OLS spec and the bare recipe (same as earlier in the file)
lm_spec <- linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')
lm_rec <- recipe(median_house_value ~ ., data = housing)

# Natural-spline expansion (10 df per variable) for every quantitative
# predictor we allow to be non-linear. One step with several selectors is
# equivalent to chaining a separate step_ns() per variable.
# Degrees of freedom must be a whole number and larger than the edf.
ns_rec <- lm_rec %>%
  step_ns(longitude, latitude, housing_median_age, total_rooms,
          total_bedrooms, households, population,
          deg_free = 10)

ns_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(ns_rec)
# Create the 10-fold CV split. NOTE(review): data_cv10 is referenced much
# earlier in this file (the fit_resamples calls and LASSO tuning), so when
# running top to bottom this definition must come before those uses.
data_cv10<- vfold_cv(housing, v = 10)
# Cross-validated MAE of the natural-spline workflow
cv_output <- fit_resamples(
ns_wf,
resamples = data_cv10, # cv folds
metrics = metric_set(mae))
cv_output %>% collect_metrics()
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 mae standard 44580. 10 222. Preprocessor1_Model1
# Fit with all data
# ns_mod <- fit(
# lm_spec, #workflow
# data = housing)
\(~\)
# Generalized additive model specification via the mgcv engine
gam_spec <- gen_additive_mod() %>%
  set_mode('regression') %>%
  set_engine(engine = 'mgcv')

# Fit the GAM: smooth (spline) terms for every quantitative predictor,
# with ocean_proximity entering as a parametric categorical term.
gam_mod <- gam_spec %>%
  fit(median_house_value ~ ocean_proximity + s(longitude) + s(latitude) +
        s(housing_median_age) + s(total_rooms) + s(total_bedrooms) +
        s(population) + s(households) + s(median_income),
      data = housing)
\(~\)
# GAM diagnostics: 2x2 panel of mgcv's residual checks, plus the basis
# dimension (k) check for each smooth term.
par(mfrow=c(2,2))
gam_mod %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 14 iterations.
## The RMS GCV score gradient at convergence was 269.7354 .
## The Hessian was positive definite.
## Model rank = 77 / 77
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(longitude) 9.00 8.95 0.89 <2e-16 ***
## s(latitude) 9.00 8.90 0.90 <2e-16 ***
## s(housing_median_age) 9.00 8.86 1.01 0.700
## s(total_rooms) 9.00 7.00 1.00 0.385
## s(total_bedrooms) 9.00 9.00 1.01 0.875
## s(population) 9.00 8.17 0.94 <2e-16 ***
## s(households) 9.00 4.95 1.01 0.690
## s(median_income) 9.00 8.15 0.97 0.005 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full mgcv summary: parametric coefficients, smooth-term significance,
# and adjusted R-squared / deviance explained.
gam_mod %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## median_house_value ~ ocean_proximity + s(longitude) + s(latitude) +
## s(housing_median_age) + s(total_rooms) + s(total_bedrooms) +
## s(population) + s(households) + s(median_income)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 211974 1968 107.712 < 2e-16 ***
## ocean_proximity2 -3028 2270 -1.334 0.182359
## ocean_proximity3 21981 2460 8.936 < 2e-16 ***
## ocean_proximity4 103433 27675 3.737 0.000186 ***
## ocean_proximity5 -20833 2664 -7.820 5.54e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(longitude) 8.950 8.999 130.752 <2e-16 ***
## s(latitude) 8.901 8.997 146.490 <2e-16 ***
## s(housing_median_age) 8.862 8.993 64.501 <2e-16 ***
## s(total_rooms) 7.004 8.188 8.413 <2e-16 ***
## s(total_bedrooms) 9.000 9.000 39.516 <2e-16 ***
## s(population) 8.174 8.754 333.367 <2e-16 ***
## s(households) 4.951 6.171 28.213 <2e-16 ***
## s(median_income) 8.151 8.804 1179.025 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.718 Deviance explained = 71.9%
## GCV = 3.7731e+09 Scale est. = 3.7604e+09 n = 20433
\(~\)
\(~\)
- The GAM output plots show us that total_rooms, total_bedrooms, households, and population become less accurate in predicting near the upper ends. Population seems to be the least accurate in predicting using smoothing splines.
# Plot all estimated smooth terms on a single page.
# Fixed: plot.gam's argument is `pages`, not `page` -- the original only
# worked through R's partial argument matching.
gam_mod %>% pluck('fit') %>% plot(pages = 1)
\(~\)
\(~\)
Below are the MAEs yielded by our models thus far: 1) OLS: 49797.67 2) LASSO: 49791.43
3) GAMs: 44580.23
Decide on an overall best model based on your investigations so far. To do this, make clear your analysis goals. Predictive accuracy? Interpretability? A combination of both?
Are there any harms that may come from your analyses and/or how the data were collected?
\(~\)
What cautions do you want to keep in mind when communicating your work?
\(~\)
\(~\)
Keep in mind that the final project will require you to complete the pieces below. Use this as a guide for your work but don’t try to accomplish everything for HW4:
Classification - Methods - Indicate at least 2 different methods used to answer your classification research question. - Describe what you did to evaluate the models explored. - Indicate how you estimated quantitative evaluation metrics. - Describe the goals / purpose of the methods used in the overall context of your research investigations.
Classification - Results - Summarize your final model and justify your model choice (see below for ways to justify your choice). - Compare the different classification models tried in light of evaluation metrics, variable importance, and data context. - Display evaluation metrics for different models in a clean, organized way. This display should include both the estimated metric as well as its standard deviation. (This won’t be available from OOB error estimation. If using OOB, don’t worry about reporting the SD.) - Broadly summarize conclusions from looking at these evaluation metrics and their measures of uncertainty.
Classification - Conclusions - Interpret evaluation metric(s) for the final model in context. Does the model show an acceptable amount of error? - If using OOB error estimation, display the test (OOB) confusion matrix, and use it to interpret the strengths and weaknesses of the final model. - Summarization should show evidence of acknowledging the data context in thinking about the sensibility of these results.
Set up K-Means by selecting variables that had the most influence based on previous data analysis (i.e., HW2)
# Feature subsets for K-means, chosen from the most influential variables
# identified in earlier analysis (HW2).
housing_popIncome <- housing %>%
select(population, median_income)
housing_bedroomAge <- housing %>%
select(housing_median_age, total_bedrooms)
# Fix the RNG seed so the (random-start) cluster assignments are reproducible
set.seed(620)
\(~\)
Create kclust variable for median income and population and graph it
# K-means (k = 3) on population and median income: once on the raw values
# and once after scaling, to compare how standardization changes the clusters
# (the variables' ranges differ by orders of magnitude -- see summary() below).
kclust_k3 <- kmeans(housing_popIncome, centers = 3)
kclust_k3_scale <- kmeans(scale(housing_popIncome), centers = 3)
# Attach both cluster assignments to the data as factors for plotting
housing_PI <- housing %>%
mutate(kclust_3 = factor(kclust_k3$cluster)) %>%
mutate(kclust_3_scale = factor(kclust_k3_scale$cluster))
ggplot(housing_PI, aes(x = population, y = median_income, color = kclust_3)) +
geom_point() +
labs(title = "K-Means clustering on median income and population") +
theme_classic()
ggplot(housing_PI, aes(x = population, y = median_income, color = kclust_3_scale)) +
geom_point() +
labs(title = "K-Means with scaled clustering on median income and population") +
theme_classic()
# Variable ranges, motivating the scaled version above
summary(housing_popIncome)
## population median_income
## Min. : 3 Min. : 0.4999
## 1st Qu.: 787 1st Qu.: 2.5637
## Median : 1166 Median : 3.5365
## Mean : 1425 Mean : 3.8712
## 3rd Qu.: 1722 3rd Qu.: 4.7440
## Max. :35682 Max. :15.0001
\(~\)
# Same comparison (raw vs. scaled K-means, k = 3) for house age and
# total bedrooms.
kclust_k3_BA <- kmeans(housing_bedroomAge, centers = 3)
kclust_k3_scale_BA <- kmeans(scale(housing_bedroomAge), centers = 3)
housing_BA <- housing %>%
mutate(kclust_3_BA = factor(kclust_k3_BA$cluster)) %>%
mutate(kclust_3_scale_BA = factor(kclust_k3_scale_BA$cluster))
ggplot(housing_BA, aes(x = total_bedrooms, y = housing_median_age, color = kclust_3_BA)) +
geom_point() +
labs(title = "K-Means clustering on number of bedrooms and house median age") +
theme_classic()
ggplot(housing_BA, aes(x = total_bedrooms, y = housing_median_age, color = kclust_3_scale_BA)) +
geom_point() +
labs(title = "K-Means with scaled clustering on number of bedrooms and house median age") +
theme_classic()
# Variable ranges, again motivating the scaled version
summary(housing_bedroomAge)
## housing_median_age total_bedrooms
## Min. : 1.00 Min. : 1.0
## 1st Qu.:18.00 1st Qu.: 296.0
## Median :29.00 Median : 435.0
## Mean :28.63 Mean : 537.9
## 3rd Qu.:37.00 3rd Qu.: 647.0
## Max. :52.00 Max. :6445.0
\(~\)
Interpret K-Means results
Between the two K-means clusterings, the two most persistent variables — median income and population — do not seem to relate to one another when we look at their clusters. Median income may increase along with population, but that does not necessarily mean that houses in those clusters have higher predicted value.
On the contrary, the number of bedrooms and median house age seem more related to one another: newer houses have more bedrooms and fairly high house values.
# Cluster profiles: mean of each clustering variable plus mean house value
# within each (scaled) K-means cluster.
housing_PI %>%
  group_by(kclust_3_scale) %>%
  summarize(across(c(median_income, population, median_house_value), mean))
# BUG FIX: housing_BA's scaled-cluster column is kclust_3_scale_BA;
# the original grouped by kclust_3_scale, which exists only in housing_PI
# and would error here.
housing_BA %>%
  group_by(kclust_3_scale_BA) %>%
  summarize(across(c(housing_median_age, total_bedrooms, median_house_value), mean))
# Make sure you understand what each line of code is doing
# Classification tree for ocean_proximity, tuning the cost-complexity
# pruning parameter by 10-fold CV.
set.seed(123) # don't change this
data_fold <- vfold_cv(housing, v = 10)
# Decision-tree spec: tune cost_complexity; min_n = 2 and unlimited depth
# let the tree grow large before pruning does the regularization.
ct_spec_tune <- decision_tree() %>%
set_engine(engine = 'rpart') %>%
set_args(cost_complexity = tune(),
min_n = 2,
tree_depth = NULL) %>%
set_mode('classification')
data_rec <- recipe(ocean_proximity ~ ., data = housing)
data_wf_tune <- workflow() %>%
add_model(ct_spec_tune) %>%
add_recipe(data_rec)
# 10 candidate cost-complexity values from 10^-5 to 10^1
param_grid <- grid_regular(cost_complexity(range = c(-5, 1)), levels = 10)
# NOTE(review): this reassigns tune_res, overwriting the LASSO tuning
# results from earlier -- fine here because the chosen penalties were
# already extracted, but rename one of them if both are needed later.
tune_res <- tune_grid(
data_wf_tune,
resamples = data_fold,
grid = param_grid,
metrics = metric_set(accuracy) #change this for regression trees
)
# Largest (most pruned) cost_complexity within 1 SE of the best accuracy
best_complexity <- select_by_one_std_err(tune_res, metric = 'accuracy', desc(cost_complexity))
data_wf_final <- finalize_workflow(data_wf_tune, best_complexity)
housing_final_fit <- fit(data_wf_final, data = housing)
# CV accuracy at the selected cost-complexity value
tune_res %>%
collect_metrics() %>%
filter(cost_complexity == best_complexity$cost_complexity)
## # A tibble: 1 × 7
## cost_complexity .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000464 accuracy multiclass 0.978 10 0.000875 Preprocessor1_Model02
# Visualize the final pruned classification tree
housing_final_fit %>% extract_fit_engine() %>% rpart.plot()